/*    Copyright 2010 Tobias Marschall
 *
 *    This file is part of MoSDi.
 *
 *    MoSDi is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    MoSDi is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with MoSDi.  If not, see <http://www.gnu.org/licenses/>.
 */

package mosdi.subcommands;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import mosdi.distributions.PoissonDistribution;
import mosdi.fa.Alphabet;
import mosdi.fa.CDFA;
import mosdi.fa.DFAFactory;
import mosdi.fa.FiniteMemoryTextModel;
import mosdi.fa.ForwardPatternExpectationCalculator;
import mosdi.fa.GeneralizedString;
import mosdi.fa.IIDTextModel;
import mosdi.fa.MarkovianTextModel;
import mosdi.fa.PatternExpectationCalculator;
import mosdi.paa.ClumpSizeCalculator;
import mosdi.paa.MatchCountDAA;
import mosdi.paa.PAA;
import mosdi.paa.TextBasedPAA;
import mosdi.util.FileUtils;
import mosdi.util.Iupac;
import mosdi.util.Log;
import mosdi.util.NamedSequence;
import mosdi.util.SequenceUtils;

public class CompPoissonEvalSubcommand extends Subcommand {
	
	@Override
	public String usage() {
		return 
		super.usage()+" [options] <iupac-pattern>\n" +
		"\n" +
		"Options:\n" +
		"  -F: read patterns from file\n" +
		"  -r: simultaneously consider reverse complementary motif\n" +
		"  -M <order>: Order of the Markov model. Default: 0 (i.i.d.)\n" +
		"  -t <fasta-file>: Estimate distribution from given sequence(s).\n" +
		"  -c <max-clump-size>: size of clump size distribution (default: 8)\n" +
		"  -n <number-of-steps>: the length of the random text (default: 500)\n" +
		"  -m <maxmatches>: maximal number of matches (default: 20)";
	}
	
	@Override
	public String description() {
		return "Compares compound Poisson approximation against exact results.";
	}

	@Override
	public String name() {
		return "comp-poisson-eval";
	}

	@Override
	public int run(String[] args) {
		parseOptions(args, 1, "FrM:t:n:m:c:");

		// Option dependencies
		impliedOptions("M", "t");

		// Mandatory arguments
		String patternParam = getStringArgument(0);

		// Options
		int textModelOrder = getNonNegativeIntOption("M", 0);
		int textLength = getPositiveIntOption("n", 500);
		// maximal number of matches to be counted (size of distribution array)
		int maxMatches = getNonNegativeIntOption("m", 20);
		boolean considerReverse = getBooleanOption("r", false);
		int maxClumpSize = getPositiveIntOption("c", 8);
		String fastaFilename = getStringOption("t", null);

		Alphabet dnaAlphabet = Alphabet.getDnaAlphabet();
		Alphabet iupacAlphabet = Alphabet.getIupacAlphabet();
		List<NamedSequence> modelEstimationSequences = null;
		FiniteMemoryTextModel textModel = null;

		// create list of patterns
		List<String> patternList = null;
		if (optionSet.has("F")) {
			// read patterns from file
			patternList = FileUtils.readPatternFile(patternParam);
		} else {
			patternList = new ArrayList<String>(1);
			patternList.add(patternParam);
		}
		
		if (fastaFilename!=null) {
			try {
				modelEstimationSequences = SequenceUtils.readFastaFile(fastaFilename, dnaAlphabet);
			} catch (Exception e) {
				Log.errorln(e.toString());
				System.exit(1);
			}
		}
	
		// create text model
		if (modelEstimationSequences==null) {
			textModel = new IIDTextModel(dnaAlphabet.size());
		} else {
			if (textModelOrder==0) {
				textModel = new IIDTextModel(dnaAlphabet.size(), SequenceUtils.sequenceList(modelEstimationSequences));
			} else {
				textModel = new MarkovianTextModel(textModelOrder, dnaAlphabet.size(), SequenceUtils.sequenceList(modelEstimationSequences));
			}
		}
		
		Log.println(Log.Level.STANDARD, "Format: >>stats>> pattern #generalized-strings #dfa-states #paa-states expectation expected-clump-size steps-to-convergence");
		for (String pattern : patternList) {
			Log.startTimer();
			
			// create CDFA
			List<GeneralizedString> genStringList = new ArrayList<GeneralizedString>(1);
			genStringList.add(Iupac.toGeneralizedString(pattern));
			if (considerReverse) {
				String reversePattern = Iupac.reverseComplementary(pattern);
				genStringList.add(Iupac.toGeneralizedString(reversePattern));
			}
	
			// create cdfa
			CDFA cdfa = DFAFactory.build(dnaAlphabet, genStringList, 50000);
			Log.printf(Log.Level.VERBOSE, "DFA states: %d%n", cdfa.getStateCount());
			cdfa = cdfa.minimizeHopcroft();
			Log.printf(Log.Level.VERBOSE, "DFA states (minimized): %d%n", cdfa.getStateCount());
			int statesCDFA = cdfa.getStateCount(); 
			
			// crate MAC 
			MatchCountDAA daa = new MatchCountDAA(cdfa, maxMatches);
			PAA paa = new TextBasedPAA(daa, textModel);
			Log.printf(Log.Level.VERBOSE, "PAA has %d states.%n", paa.getStateCount());
			int statesPAA = paa.getStateCount();
	
			Log.restartTimer("Construct PAA");
			double timeAutomaton = Log.getLastPeriodCpu();
			
			// calculate expectation
			double singleExpectation = 0.0;
			PatternExpectationCalculator pec = new ForwardPatternExpectationCalculator(textModel, Iupac.asGeneralizedAlphabet(), iupacAlphabet.buildIndexArray(pattern));
			singleExpectation += pec.getPrefixExpectation(pattern.length());
			if (considerReverse) {
				PatternExpectationCalculator rpec = new ForwardPatternExpectationCalculator(textModel, Iupac.asGeneralizedAlphabet(), iupacAlphabet.buildIndexArray(Iupac.reverseComplementary(pattern)));
				singleExpectation += rpec.getPrefixExpectation(pattern.length());
			}
			double expectation = singleExpectation * (textLength-pattern.length()+1);
	
			// calculate clump size distribution
			ClumpSizeCalculator csc = new ClumpSizeCalculator(textModel, cdfa, pattern.length());
			double[] clumpSizeDistribution = csc.clumpSizeDistribution(maxClumpSize, 1e-50);
			int stepsToConvergence = csc.stepsToConvergence();
			
			// calculate expected clump size
			double expectedClumpSize = 0.0;
			for (int i=1; i<clumpSizeDistribution.length; ++i) {
				expectedClumpSize+=clumpSizeDistribution[i]*i;
			}
	
			// calculate compound poisson approximation
			double lambda = expectation/expectedClumpSize;
			PoissonDistribution poissonDist = new PoissonDistribution(lambda);
			double[] cpDist = poissonDist.compoundPoissonDistribution(clumpSizeDistribution, maxMatches);
	
			Log.restartTimer("Compound Poisson approximation");
			double timeCompPoisson = Log.getLastPeriodCpu();
			
			// calculate exact distribution
			double[] exactDist = paa.computeValueDistribution(textLength);
	
			Log.stopTimer("Exact distribution");
			double timeExact = Log.getLastPeriodCpu();
			
			// calculate relative errors
			double[] relativeErrors = new double[maxMatches+1];
			for (int i=0; i<=maxMatches; ++i) relativeErrors[i]=Math.abs(cpDist[i]-exactDist[i])/exactDist[i];
	
			// calculate relative errors for log probs
			double[] relativeLogErrors = new double[maxMatches+1];
			for (int i=0; i<=maxMatches; ++i) relativeLogErrors[i]=Math.abs(Math.log(cpDist[i])-Math.log(exactDist[i]))/-Math.log(exactDist[i]);
	
			Log.printf(Log.Level.VERBOSE, "Exact clump size distribution: %s%n", Arrays.toString(clumpSizeDistribution));
			Log.printf(Log.Level.VERBOSE, "Expected clump size: %e%n", expectedClumpSize);
			Log.printf(Log.Level.VERBOSE, "Compound Poisson approximation of occurrence count dist: %s%n", Arrays.toString(cpDist));
			Log.printf(Log.Level.VERBOSE, "Exact occurrence count dist: %s%n", Arrays.toString(exactDist));
			Log.printf(Log.Level.VERBOSE, "Relative errors: %s%n", Arrays.toString(relativeErrors));
			
			StringBuilder sb = new StringBuilder();
			sb.append(String.format(">>stats>> %s %d %d %d %e %e %d ", pattern, genStringList.size(), statesCDFA, statesPAA, expectation, expectedClumpSize, stepsToConvergence));
			sb.append(Log.format(">>runtimes>> %t %t %t ", timeAutomaton, timeCompPoisson, timeExact));
			sb.append(">>exact_dist>> ");
			for (double d : exactDist) sb.append(String.format("%e ", d));
			sb.append(">>cp_dist>> ");
			for (double d : cpDist) sb.append(String.format("%e ", d));
			sb.append(">>rel_err>> ");
			for (double d : relativeErrors) sb.append(String.format("%e ", d));
			sb.append(">>rel_log_err>> ");
			for (double d : relativeLogErrors) sb.append(String.format("%e ", d));
			if (sb.length()>0) Log.println(Log.Level.STANDARD, sb.toString());
		}
		return 0;
	}
}
